{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import datetime\n",
    "import pandas as pd\n",
    "from sklearn import preprocessing\n",
    "\n",
    "tr_csv_path = 'data/train.csv'\n",
    "ts_csv_path = 'data/test.csv'\n",
    "\n",
    "data_type = {'id': 'U', 'hour': 'U', 'device_type':'U', 'C1':'U', 'C15':'U', 'C16':'U'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(40428967, 23)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the training set; `id` becomes the index and `data_type` pins the string columns.\n",
    "train = pd.read_csv(\n",
    "    tr_csv_path,\n",
    "    index_col='id',\n",
    "    dtype=data_type,\n",
    ")\n",
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4577464, 23)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the test set with the same index column and dtype overrides as the training set.\n",
    "test = pd.read_csv(\n",
    "    ts_csv_path,\n",
    "    index_col='id',\n",
    "    dtype=data_type,\n",
    ")\n",
    "# Prepend a constant `click` column (all zeros) as the first column\n",
    "# (presumably so test has the same columns as train for the concat that follows - confirm).\n",
    "test.insert(loc=0, column='click', value=0)\n",
    "test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack test rows first, then train rows. The export step at the end of the notebook\n",
    "# relies on this order when it splits the combined frame by position.\n",
    "tr_ts = pd.concat(\n",
    "    [test, train],\n",
    "    copy=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 一、统计类别型特征的取值频次"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 1. site_id：出现次数不超过 20 的取值标记为低频"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag every site_id by how often it occurs in train+test combined:\n",
    "#   0 -> seen more than 20 times (kept as its own category later)\n",
    "#   1 -> seen 20 times or fewer (collapsed into a shared bucket later)\n",
    "# `value_counts()` is sorted by descending count, so all 0-flagged keys precede the 1-flagged ones.\n",
    "site_id_count = tr_ts.site_id.value_counts()\n",
    "site_id_C_type_dict = {\n",
    "    str(value): int(count <= 20) for value, count in site_id_count.items()\n",
    "}\n",
    "\n",
    "# Write through a context manager so the file is flushed and closed deterministically.\n",
    "with open(\"data/site_id_C_type_dict.json\", \"w\") as fh:\n",
    "    json.dump(site_id_C_type_dict, fh)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 2. site_domain：出现次数不超过 20 的取值标记为低频"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag every site_domain by how often it occurs in train+test combined:\n",
    "#   0 -> seen more than 20 times (kept as its own category later)\n",
    "#   1 -> seen 20 times or fewer (collapsed into a shared bucket later)\n",
    "# `value_counts()` is sorted by descending count, so all 0-flagged keys precede the 1-flagged ones.\n",
    "site_domain_count = tr_ts.site_domain.value_counts()\n",
    "site_domain_C_type_dict = {\n",
    "    str(value): int(count <= 20) for value, count in site_domain_count.items()\n",
    "}\n",
    "\n",
    "# Write through a context manager so the file is flushed and closed deterministically.\n",
    "with open(\"data/site_domain_C_type_dict.json\", \"w\") as fh:\n",
    "    json.dump(site_domain_C_type_dict, fh)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3. app_id：出现次数不超过 20 的取值标记为低频"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag every app_id by how often it occurs in train+test combined:\n",
    "#   0 -> seen more than 20 times (kept as its own category later)\n",
    "#   1 -> seen 20 times or fewer (collapsed into a shared bucket later)\n",
    "# `value_counts()` is sorted by descending count, so all 0-flagged keys precede the 1-flagged ones.\n",
    "app_id_count = tr_ts.app_id.value_counts()\n",
    "app_id_C_type_dict = {\n",
    "    str(value): int(count <= 20) for value, count in app_id_count.items()\n",
    "}\n",
    "\n",
    "# Write through a context manager so the file is flushed and closed deterministically.\n",
    "with open(\"data/app_id_C_type_dict.json\", \"w\") as fh:\n",
    "    json.dump(app_id_C_type_dict, fh)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 4. device_model：出现次数不超过 200 的取值标记为低频"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag every device_model by how often it occurs in train+test combined:\n",
    "#   0 -> seen more than 200 times (kept as its own category later)\n",
    "#   1 -> seen 200 times or fewer (collapsed into a shared bucket later)\n",
    "# `value_counts()` is sorted by descending count, so all 0-flagged keys precede the 1-flagged ones.\n",
    "device_model_count = tr_ts.device_model.value_counts()\n",
    "device_model_C_type_dict = {\n",
    "    str(value): int(count <= 200) for value, count in device_model_count.items()\n",
    "}\n",
    "\n",
    "# Write through a context manager so the file is flushed and closed deterministically.\n",
    "with open(\"data/device_model_C_type_dict.json\", \"w\") as fh:\n",
    "    json.dump(device_model_C_type_dict, fh)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 二、 处理类别型数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the last two characters of the `hour` string, using the vectorised `.str`\n",
    "# accessor instead of a per-row Python lambda.\n",
    "# NOTE(review): presumably this is the hour-of-day part of a YYMMDDHH stamp - confirm against the data.\n",
    "tr_ts['hour'] = tr_ts['hour'].str[-2:]\n",
    "\n",
    "# 0 when device_id equals 'a99f214a', 1 for every other value.\n",
    "# NOTE(review): 'a99f214a' looks like a shared placeholder id - confirm.\n",
    "tr_ts['is_device'] = tr_ts['device_id'].ne('a99f214a').astype('int64')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _read_json(path):\n",
    "    \"\"\"Return the parsed contents of the JSON file at `path`, closing the file afterwards.\"\"\"\n",
    "    with open(path, \"r\") as fh:\n",
    "        return json.load(fh)\n",
    "\n",
    "\n",
    "# Reload the value -> flag maps (0 = frequent, 1 = rare) written by the counting cells above,\n",
    "# so this section can run from the files on disk without recomputing the counts.\n",
    "app_id_C_type_dict = _read_json(\"data/app_id_C_type_dict.json\")\n",
    "site_id_C_type_dict = _read_json(\"data/site_id_C_type_dict.json\")\n",
    "site_domain_C_type_dict = _read_json(\"data/site_domain_C_type_dict.json\")\n",
    "device_model_C_type_dict = _read_json(\"data/device_model_C_type_dict.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _collapse_rare(values, rarity_flags, placeholder):\n",
    "    \"\"\"Keep entries whose flag in `rarity_flags` is 0; replace every other entry with `placeholder`.\n",
    "\n",
    "    Entries that are missing from `rarity_flags` map to NaN, fail the `== 0` check and are\n",
    "    therefore replaced as well, which matches the behaviour of `rarity_flags.get(x) == 0`.\n",
    "    \"\"\"\n",
    "    is_frequent = values.map(rarity_flags).eq(0)\n",
    "    return values.where(is_frequent, placeholder)\n",
    "\n",
    "\n",
    "# Vectorised lookup instead of one Python-level lambda call per row.\n",
    "tr_ts['C_app_id'] = _collapse_rare(tr_ts[\"app_id\"], app_id_C_type_dict, \"other_app_id\")\n",
    "tr_ts['C_site_id'] = _collapse_rare(tr_ts['site_id'], site_id_C_type_dict, \"other_site_id\")\n",
    "tr_ts['C_site_domain'] = _collapse_rare(tr_ts['site_domain'], site_domain_C_type_dict, \"other_site_domain\")\n",
    "tr_ts['C_device_model'] = _collapse_rare(tr_ts['device_model'], device_model_C_type_dict, \"other_device_model\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build pairwise interaction features by joining two string columns with '&'.\n",
    "for combined, (left, right) in (\n",
    "    (\"C_pix\", (\"C15\", \"C16\")),\n",
    "    (\"C_device_type_1\", (\"device_type\", \"C1\")),\n",
    "):\n",
    "    tr_ts[combined] = tr_ts[left] + '&' + tr_ts[right]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove raw columns in place. All but C17 feed one of the derived features built above\n",
    "# (is_device, C_app_id, C_site_id, C_site_domain, C_device_model, C_pix, C_device_type_1);\n",
    "# C17 is removed without a derived replacement.\n",
    "tr_ts.drop(\n",
    "    columns=[\n",
    "        'device_id', \"device_type\", 'app_id', 'site_id', 'site_domain',\n",
    "        'device_model', \"C1\", \"C17\", 'C15', 'C16',\n",
    "    ],\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "convert hour...\n",
      "convert banner_pos...\n",
      "convert site_category...\n",
      "convert app_domain...\n",
      "convert app_category...\n",
      "convert device_conn_type...\n",
      "convert C14...\n",
      "convert C18...\n",
      "convert C19...\n",
      "convert C20...\n",
      "convert C21...\n",
      "convert is_device...\n",
      "convert C_app_id...\n",
      "convert C_site_id...\n",
      "convert C_site_domain...\n",
      "convert C_device_model...\n",
      "convert C_pix...\n",
      "convert C_device_type_1...\n"
     ]
    }
   ],
   "source": [
    "# Columns to replace with integer codes. `click` and `device_ip` are not listed,\n",
    "# so they are left as they are (NOTE(review): confirm `device_ip` is meant to stay unencoded).\n",
    "C_fields = [\n",
    "    'hour', 'banner_pos', 'site_category', 'app_domain', 'app_category',\n",
    "    'device_conn_type', 'C14', 'C18', 'C19', 'C20', 'C21', 'is_device',\n",
    "    'C_app_id', 'C_site_id', 'C_site_domain', 'C_device_model',\n",
    "    'C_pix', 'C_device_type_1',\n",
    "]\n",
    "\n",
    "# One encoder object is refit for each column, so after the loop `lenc` only holds\n",
    "# the classes of the last column.\n",
    "lenc = preprocessing.LabelEncoder()\n",
    "for column in C_fields:\n",
    "    print(f\"convert {column}...\")\n",
    "    tr_ts[column] = lenc.fit_transform(tr_ts[column])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `tr_ts` was built as [test, train], so the first len(test) rows are the test set\n",
    "# and everything after them is the training set.\n",
    "n_test_rows = test.shape[0]\n",
    "tr_ts.iloc[:n_test_rows].to_csv('data/test_FE.csv')\n",
    "tr_ts.iloc[n_test_rows:].to_csv('data/train_FE.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['click', 'hour', 'banner_pos', 'site_category', 'app_domain',\n",
       "       'app_category', 'device_ip', 'device_conn_type', 'C14', 'C18', 'C19',\n",
       "       'C20', 'C21', 'is_device', 'C_app_id', 'C_site_id', 'C_site_domain',\n",
       "       'C_device_model', 'C_pix', 'C_device_type_1'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the column labels currently present on the combined train+test frame.\n",
    "tr_ts.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
